
# Start from an empty workspace: removes every object returned by ls().
# NOTE(review): this also wipes anything the caller had defined; it is kept
# because the save.image() call at the end of this file persists the whole workspace.
rm(list=ls())

#####################
# The purpose of this file is to load up the datasets and clean them for processing.
#
# PrepDF1 - load and clean revisions data
# PrepDF2 - load and clean view/quality data
#           (NOTE(review): only the "Part 1" revisions section is marked in this
#            file -- confirm where the view/quality step lives)
#
#####################

# Run the upstream preparation scripts first; they are executed for their side
# effects (objects left in the workspace / files rewritten).
# NOTE(review): `userDF`, used in the editor join further down, is not defined
# in this file -- presumably it is created by one of these scripts; confirm.
source('prepCategoryData.R') #remaking all the subfiles also
source('justDamaging.R')
source('prepUserData.R')

#### Globals: where things live on disk, and the window of time we have records for

# Project root. The commented-out line is the alternative (local) location.
#basePath <- '/home/kaylea/Research/taboo/'
basePath <- '/gscratch/comdata/users/kaylea/taboo/'

# Directories derived from the project root
dataPath <- paste0(basePath, 'processed_data/')
rawPath <- paste0(basePath, 'raw_data/')
#coefPath <- paste0(basePath, 'processed_data/coefs/')
coefPath <- paste0(basePath, 'processed_data/euph/')
ngramPath <- paste0(basePath, 'processed_data/ngram/') #ngram lives here

# Individual input files
botsFile <- paste0(rawPath, 'botList.tsv')
salienceFile <- paste0(dataPath, 'grammifiedData/ngrams.tsv')
salientArticlesFile <- paste0(dataPath, 'grammifiedData/salientArticles.tsv')
#spellsFile <- paste0(dataPath, 'protectionSpells.tsv')

# Observation window, parsed straight into date-time objects
startOfRecords <- strptime('2008-09-20 05:23:14', "%Y-%m-%d %H:%M:%S")
endOfRecords <- strptime('2022-06-02 20:15:46', "%Y-%m-%d %H:%M:%S") #derived from end of the action logs

# Restore previously saved objects into the workspace
# (presumably including `spells`, which the protection section reads -- confirm)
load(paste0(dataPath, 'spells.RData'))



library(dplyr)
library(sqldf)
library(lubridate)
library(data.table)
library(urltools)


#recipe from https://www.r-bloggers.com/2011/06/merge-all-files-in-a-directory-using-r-into-a-single-dataframe/
# Read every tab-separated part file in a directory and stack them into a
# single, de-duplicated data frame.
#
# Args:
#   path: directory that holds the part files. It is pasted directly in front
#         of each file name, so it must end with a path separator.
#
# Returns:
#   A data.frame holding the unique rows found across all files in `path`
#   (the Spark '_SUCCESS' marker file is skipped).
#
# Raises:
#   An error if the directory contains no data files.
#
# Each file is read exactly once into a local list and bound in a single
# rbind call. The earlier exists("dataset") test read the first file twice and
# could pick up an unrelated `dataset` object from an enclosing environment.
readPileToDF <- function(path) {
  file_list <- list.files(path)
  print(file_list)

  data_files <- file_list[file_list != '_SUCCESS'] #spark metadata file, ignore
  if (length(data_files) == 0) {
    stop('No data files found in: ', path, call. = FALSE)
  }

  pieces <- lapply(data_files, function(my_file) {
    print(paste0('Now Reading: ', path, my_file))
    read.table(paste0(path, my_file), quote="\"", header=TRUE, sep="\t", stringsAsFactors=FALSE)
  })

  # part files may overlap, so keep only distinct rows
  unique(do.call(rbind, pieces))
}

# Part 1 - load and clean revisions data

## --- taboo sample ---
## (previous location: paste0(coefPath, 'taboo/revDataPlusUPL/'))
revDF.CTab <- readPileToDF(paste0(coefPath, 'revDataPlusUPL/'))
revDF.CTab$source <- "taboo"
# a missing userpage length is recorded as 0 characters
revDF.CTab$userpage_text_chars <- replace(
  revDF.CTab$userpage_text_chars,
  is.na(revDF.CTab$userpage_text_chars),
  0
)
head(revDF.CTab)

## The taboo data lacks some fields we need, so bring them in from the ngram
## counts (ngrams.tsv) and the salient-articles file.

# ngram and count; ngram should be the same as the stripped title. Repeated rows are dropped.
ngramDF <- unique(
  read.table(salienceFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
)
# salient articles, again without repeated rows
salientDF <- unique(
  read.table(salientArticlesFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
)
# attach each salient article's ngram count (filtered_title <-> ngram)
salientDF <- merge(salientDF, ngramDF, by.x="filtered_title", by.y="ngram")
n.salient <- nrow(salientDF) ## size of the salient dataset

# only revisions of salient articles survive this (inner) join
revDF.CTab <- merge(revDF.CTab, salientDF, by="encodedTitle")

## discard fields that are not needed downstream
revDF.CTab <- revDF.CTab[
  ,
  setdiff(names(revDF.CTab), c("prediction", "filtered_title", "target")),
  drop = FALSE
]

## --- ngram (comparison) sample ---
revDF.NGram <- readPileToDF(paste0(ngramPath,'revDataPlusUPL/'))
revDF.NGram$source <- "ngram"
# a missing userpage length is recorded as 0 characters
revDF.NGram$userpage_text_chars <- replace(
  revDF.NGram$userpage_text_chars,
  is.na(revDF.NGram$userpage_text_chars),
  0
)

## discard fields that are not needed downstream; names that are absent are
## simply ignored. articleID and index are cleared from both samples.
revDF.NGram <- revDF.NGram[
  ,
  setdiff(
    names(revDF.NGram),
    c("prediction",
      "articleID.x", "articleID.y", "articleID_x", "articleID_y",
      "target_x", "target_y",
      "prediction.x", "prediction.y",
      "titlePred", "articlePred",
      "sha1", "minor", "namespace",
      "chosenTarget", "target", "ngram", "filtered_title",
      "articleID", "index")
  ),
  drop = FALSE
]
revDF.CTab <- revDF.CTab[
  ,
  setdiff(names(revDF.CTab), c("articleID", "index")),
  drop = FALSE
]

colnames(revDF.CTab)
colnames(revDF.NGram)

# An article drawn into the ngram sample that is also in the taboo sample is
# a bad-luck random draw: remove it from the ngram side.
problems <- intersect(unique(revDF.NGram$encodedTitle), unique(revDF.CTab$encodedTitle))
problems
revDF.NGram <- revDF.NGram[!(revDF.NGram$encodedTitle %in% problems), , drop = FALSE]
# re-check: the overlap should now be empty
problems <- intersect(unique(revDF.NGram$encodedTitle), unique(revDF.CTab$encodedTitle))
problems

# stack both samples, keep distinct rows, and release the per-sample frames
revDF <- unique(rbind(revDF.NGram, revDF.CTab))
revDF.NGram <- revDF.CTab <- NULL
head(revDF)

# Keep only rows that have a revision id.
revDF <- revDF[!is.na(revDF$revid),] #drop any where revid is NA

# Attach per-editor information.
# NOTE(review): userDF is not defined in this file -- presumably it comes from
# one of the scripts sourced at the top; confirm.
revDF <- merge(x=revDF, y=userDF, by='editor', all.x=TRUE) #left (outer) join: all of revDF, plus any matches in userDF

## (articles present in both samples were already removed from the ngram
##  sample before the two samples were combined)



### prepare bot filter
# One row per known bot; BotUserID is the column used as the bot's editor id.
botDF <- read.table(botsFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
botDF <- unique(botDF) #strip out any repetitions
#botRoleDF <- read.table(botsRoleFile, sep='\t', quote='"', header=TRUE, stringsAsFactors=FALSE)
head(revDF)
head(botDF)
# Give both tables an editor_id key of the same (character) type so they can be joined.
botDF$editor_id <- as.character(botDF$BotUserID) #just to make sure
revDF$editor_id <- as.character(revDF$editor_id)
head(revDF)
head(botDF)
# Convert both to data.table so the := update-join below can be used.
revDF <- setDT(revDF)
botDF <- setDT(botDF)
# Two chained steps: first set isBot to FALSE on every row; then, for rows whose
# editor_id matches a row of botDF, set isBot to TRUE. `.()` is data.table shorthand for list().
revDF <- revDF[,isBot :=FALSE][botDF, isBot := TRUE, on= .(editor_id)]
# Tally of bot vs. non-bot revisions, kept for reference.
isABot.tab <- table(revDF$isBot)

## drop all bots here
revDF.clean <- subset(revDF, revDF$isBot==FALSE)
revDF <- NULL #so we don't use it accidentally
# anon holds 'true'/'false' text; loggedIn is its logical negation (NA where anon is not parseable).
revDF.clean$loggedIn <- !(as.logical(revDF.clean$anon))

##### Filtering is finished; from here on we summarise.

### Weighting
## Within an article, every revision gets weight
##     (N_rev_total / N_total_articles) / N_revisions_of_that_article
## which satisfies two criteria:
##   * sum(weights) = total_revs
##   * sum(weights for a given article) is the same for every article
## NOTE(review): these counts are taken before the rows with unparseable `anon`
## are dropped further down, so they still include those rows -- confirm intended.

# per-article number of revisions
numEdits <- dplyr::summarize(
  group_by(revDF.clean, encodedTitle),
  numEdits=length(revid)
)
# per-article number of distinct editors (IP addresses count as editors)
numEditors <- dplyr::summarize(
  group_by(revDF.clean, encodedTitle),
  numEditors=length(unique(editor))
)
n.revs <- nrow(revDF.clean) ## total number of revisions
n.arts <- nrow(numEdits)    ## total number of articles

# attach both per-article counts to every revision, then compute the weight
revDF.clean <- merge(
  merge(revDF.clean, numEdits, by="encodedTitle"),
  numEditors,
  by="encodedTitle"
)
revDF.clean$weight <- (n.revs/n.arts)/revDF.clean$numEdits

# `count` wasn't a very descriptive name, so expose it as ngramWeight too
revDF.clean$ngramWeight <- revDF.clean$count

# a revision counts as reverted whenever reverted_by is filled in
revDF.clean <- revDF.clean %>% mutate(got_reverted = !is.na(reverted_by))

# Keep only rows whose anon flag is exactly 'true' or 'false' ('true' rows
# first, then 'false' rows); the small number of NAs (187) look like parse problems.
table(revDF.clean$anon)
revDF.clean <- rbind(
  subset(revDF.clean, anon == 'true'),
  subset(revDF.clean, anon == 'false')
)
table(revDF.clean$anon)


##### dropping items with missing revids; if this happens, find out why
###revDF.clean <- revDF.clean[!is.na(revDF.clean$revid)]




# One row per article: number of revisions (revid), number of reverted
# revisions (got_reverted) and the earliest date_time seen for the article.
artDF <- revDF.clean %>% dplyr::group_by(encodedTitle) %>% dplyr::summarize(
        across(revid, length),
        across(got_reverted, sum),
        across(date_time, min)
)

# (title, source) pairs, used to record which sample each article came from
titleSampleDF <- data.frame('encodedTitle' = revDF.clean$encodedTitle, 'source'=revDF.clean$source)
titleSampleDF <- unique(titleSampleDF)

artDF <- merge(artDF, titleSampleDF, by='encodedTitle', all.x=TRUE) #which sample is it from

# earliest revision time, parsed from the date_time text
artDF$min.birthday <- strptime(artDF$date_time,  "%Y-%m-%d %H:%M:%S")
artDF$startOfRecords <- startOfRecords
artDF$birthOrLog <- pmax(artDF$min.birthday, artDF$startOfRecords) #birthday or beginning of records, whichever comes later

## how many seconds old is each article inside the logged scope?
# endOfRecords and birthOrLog are already date-times, so they are used as such.
# Re-parsing them with strptime() goes through as.character(), which can drop
# the time-of-day (e.g. exact-midnight values) and then yields NA.
# as.POSIXct() leaves date-times untouched and only uses `format` for text input.
artDF$secondsOldLog <- as.numeric(difftime(
  as.POSIXct(endOfRecords, format = "%Y-%m-%d %H:%M:%S"),
  as.POSIXct(artDF$birthOrLog, format = "%Y-%m-%d %H:%M:%S"),
  units="secs"
))
## how many seconds old is each article inside the logged scope? 


#####################
## adding in "protection spell" information to the artDF

# only edit-protection spells are of interest
protDF <- subset(spells, spells$type=='edit')
### apply corrections to protection data to deal with the range of data we have
protDF$end <- replace(protDF$end, is.na(protDF$end), endOfRecords) #if it's NA, we set it to the end of recorded time
protDF$start <- replace(protDF$start, is.na(protDF$start), startOfRecords) #if it's NA, we set it to the start of recorded time
# Build the join key: turn every underscore into a space, then URL-encode.
# gsub (not sub) so that titles with more than one underscore are fully converted.
protDF$encodedTitle <- gsub('_', ' ', protDF$title, fixed = TRUE)
protDF$encodedTitle <- url_encode(protDF$encodedTitle) #URLencode is not vector friendly

## what if the protection went off and on prior to article deletion and re-creation events? the export of the article will not have the revisions made pre-deletion
protDF <- as.data.frame(protDF)
artDF <- as.data.frame(artDF)
# inner join: one row per (article, protection spell); articles without spells drop out here
protDF.clean <- merge(x=artDF, y=protDF, by = 'encodedTitle')
protDF.clean <- subset(protDF.clean, protDF.clean$end > protDF.clean$min.birthday) #eliminate protection records that ended "before" the birth of the article
protDF.clean$start.clean <- pmax(protDF.clean$start, protDF.clean$min.birthday) ## which ever came later
# keeps only spells that started at or after the article's first revision
# NOTE(review): spells that began before the birthday are dropped rather than
# clipped to it -- confirm this is intended.
protDF.clean <- subset(protDF.clean, protDF.clean$start == protDF.clean$start.clean)

## how many seconds did each protection event last?
# end / start.clean are used as date-times directly; re-parsing them with
# strptime() goes through as.character(), which can drop the time-of-day
# (e.g. exact-midnight values) and then yields NA.
protDF.clean$duration <- as.numeric(difftime(
  as.POSIXct(protDF.clean$end, format = "%Y-%m-%d %H:%M:%S"),
  as.POSIXct(protDF.clean$start.clean, format = "%Y-%m-%d %H:%M:%S"),
  units="secs"
))

# total protected seconds per article
byArtProtDF <- protDF.clean %>% group_by(encodedTitle) %>% dplyr::summarize(across(duration, sum))
# left join (all.x) so articles that were never protected are kept, with duration NA
artDF.prot <- merge(x=artDF, y=byArtProtDF, by = 'encodedTitle', all.x=TRUE)
## in any cases where duration is NA, make it 0
# the NA test must look at artDF.prot: artDF itself has no duration column
artDF.prot$duration <- replace(artDF.prot$duration, is.na(artDF.prot$duration), 0)

artDF.prot$pct.prot <- artDF.prot$duration/artDF.prot$secondsOldLog ## what proportion of its observed life was the article protected?

# copy the proportion back by title rather than by row position
artDF$pct.prot <- artDF.prot$pct.prot[match(artDF$encodedTitle, artDF.prot$encodedTitle)]

#print("saving just articles info")
#save(artDF, artDF.prot, file=paste0(dataPath, "artDF.RData"), version=2)
#print("saving just revDF.clean for viz work")
#save(revDF.clean, file=paste0(dataPath, "forViz1.RData"), version=2)
print("saving full image")
# writes every object currently in the workspace to dataset1.RData
save.image(paste0(dataPath, "dataset1.RData"), version=2)
